x = read.csv('/sc/orga/projects/pd-omics/data/amp-pd/evan/amp-pd_metadata.csv', row.names = 1, header = T, check.names = F)
table(x$status, x$sex) Female Male
Case 289 491 Control 285 219
counts = read.table('/sc/orga/projects/pd-omics/data/amp-pd/evan/amp-pd_baseline_counts.txt', header = T, check.names = F)
mynd_metadata = x
counts = counts[rownames(mynd_metadata)]
tpms = read.table('/sc/orga/projects/pd-omics/data/amp-pd/evan/amp-pd_baseline_tpms.txt', header = T, check.names = F)
tpms = tpms[rownames(mynd_metadata)]barplot(mynd_metadata$PCT_CODING_BASES, names = rownames(mynd_metadata), las=2, cex.axis = 0.8)
title("PCT CODING")
barplot(mynd_metadata$PCT_RIBOSOMAL_BASES, names=rownames(mynd_metadata), las=2, cex.axis = 0.8)
title("PCT_RIBOSOMAL")
barplot(mynd_metadata$PCT_INTERGENIC_BASES, names=rownames(mynd_metadata), las=2, cex.axis = 0.8)
title("INTERGENIC BASES")# You don't need to run everythin again, we can load the RData from here!
#load(paste0(work_dir, "mynd.expression.RData"))
### Removing genes with low expression
### >= 30%
x = data.frame(counts, check.names = F)
cpm = cpm(x)
keep.exp = rowSums(cpm > 1) >= 0.3*ncol(x)
#These are the final tables of expression!
mynd_cpm = cpm[keep.exp, ]
keep = rowSums(tpms > 1) >= 0.3*ncol(x)
mynd_abundance = tpms[keep, ]
dim(mynd_abundance)[1] 18112 1284
mynd_counts = counts[keep.exp,]
# Normalization
norm = calcNormFactors(mynd_counts, method = "TMM") #normalized with TMM
#Creat Limma object
x <- DGEList(counts=mynd_counts, samples=mynd_metadata, norm.factors = norm)
v = voom(x)##Exploratory Analysis ### Library sizes
[1] 1284
[1] 2820424 73459397
[1] 23567225
#We can also plot the library sizes as a barplot to see whether there are any major discrepancies between the samples more easily.
#png(paste0(work_plots, "library_sizes.#png"), width = 16, height = 10, res = 300, units = "in")
barplot(sort(x$samples$lib.size), names=colnames(x), las=2, cex.axis = 0.8)
title("Barplot of library sizes")
hist(x$samples$lib.size)
#dev.off() ### PCA plots
par(mfrow=c(1,2))
#PCA
mynd_voom_t = t(mynd_voom)
autoplot(prcomp(mynd_voom_t), data = mynd_metadata, colour = "sex")
autoplot(prcomp(mynd_voom_t), data = mynd_metadata, colour = "status")
autoplot(prcomp(mynd_voom_t), data = x$samples, colour = "lib.size")
autoplot(prcomp(mynd_voom_t), data = x$samples, colour = "Plate")
#dev.off()#Only 2 samples
#png(paste0(work_plots, "Scatter_all.#png"), width = 8, height = 6, res = 300, units = "in")
par( mfrow = c( 2,2 ))
plot(log2(mynd_counts[,1:2] + 1), pch=16, cex=0.3, main = "COUNTS")
plot(log2(mynd_cpm[,1:2] + 1), pch=16, cex=0.3, main = "CPM")
plot(log2(mynd_abundance[,1:2] + 1), pch=16, cex=0.3, main = "TPM")
plot(mynd_voom[,1:2], pch=16, cex=0.3, main = "VOOM")
#dev.off()#Density plot with all samples
# cpm table = ppmi_cpm_case_control
# log cpm table = lcpm
lcounts = log2(mynd_counts)
lcpm = log2(mynd_cpm)
ltpm = log2(mynd_abundance)
# voom use log already
nsamples <- ncol(mynd_cpm)
#samplenames <- colnames(mynd_cpm)
colfunc <- colorRampPalette(c("#4DBBD5FF", "#3C5488FF"))
col = alpha(colfunc(nsamples), alpha = 0.1)
#png(paste0(work_plots, "Density_all.#png"), width = 8, height = 6, res = 300, units = "in")
par(mfrow=c(2,2))
#COUNTS
plot(density(lcounts[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. COUNTS ", xlab="Log2(counts)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcounts[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#CPM
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. CPM ", xlab="Log2(CPM)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#TPM
plot(density(ltpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="C. TPM ", xlab="Log2(TPM)")
#abline(v=ltpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(ltpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#Voom
plot(density(mynd_voom[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="D. VOOM ", xlab="voom")
#abline(v=lvoom.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(mynd_voom[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#dev.off()###Boxplot with log + 1 Only with few samples ###
#png(paste0(work_plots, "Boxplot_all.#png"), width = 8, height = 6, res = 300, units = "in")
#Counts
boxplot(log2(mynd_counts[, 1:300] +1), col=rainbow(20), main="Counts", ylab="Log2(Counts+1)",las=2,cex.axis=0.8)
#CPM
boxplot(log2(mynd_cpm[, 1:300] +1), col=rainbow(20), main="CPM", ylab="Log2(CPM+1)",las=2,cex.axis=0.8)
#TPM
boxplot(log2(mynd_abundance[, 1:300] +1), col=rainbow(20), main="TPM", ylab="Log2(TPM+1)",las=2,cex.axis=0.8)
#Voom
boxplot(mynd_voom[, 1:300], col=rainbow(20), main="Voom", ylab="Voom",las=2,cex.axis=0.8)
#dev.off()total = merge(mynd_voom_t, mynd_metadata, by = 0)
total$mismatch = ""
labelIndicesWeWant <- total$sex == "M" & total$ENSG00000229807.11 > 5 | total$sex == "F" & total$ENSG00000183878.15 > .2
total$mismatch[labelIndicesWeWant] <- as.character(total$Row.names[labelIndicesWeWant])
p = ggplot(total, aes(x = ENSG00000229807.11, y = ENSG00000183878.15, color = sex, label = mismatch))
p + geom_point() + geom_text_repel() + xlab("XIST") + ylab("UTY")